x86, vmx: Enable EPT (Extended Page Tables) support on new Intel processors.
author     Keir Fraser <keir.fraser@citrix.com>
           Wed, 9 Apr 2008 10:30:32 +0000 (11:30 +0100)
committer  Keir Fraser <keir.fraser@citrix.com>
           Wed, 9 Apr 2008 10:30:32 +0000 (11:30 +0100)
We use the EPT page table as the P2M (guest-physical-to-machine
mapping), removing the linear page table when EPT is used for the
domain (see the new file p2m-ept.c). This is done by adding three
operations to struct p2m_domain. If VT-d is enabled, the EPT page table
is also used as the VT-d page table (i.e. shared).
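
The three operations mentioned above are function pointers in struct
p2m_domain. A condensed sketch, abridged from the hunks below (only the
new members and the EPT wiring are shown; the full declarations are in
xen/include/asm-x86/p2m.h and xen/arch/x86/mm/hap/p2m-ept.c):

    /* Abridged: the p2m is now driven through per-domain hooks, so an
     * EPT-backed implementation can replace the default page-table one
     * without touching callers. */
    struct p2m_domain {
        /* ... lock, page list, alloc_page/free_page hooks ... */
        int   (*set_entry)(struct domain *d, unsigned long gfn,
                           mfn_t mfn, p2m_type_t p2mt);
        mfn_t (*get_entry)(struct domain *d, unsigned long gfn,
                           p2m_type_t *t);
        mfn_t (*get_entry_current)(unsigned long gfn, p2m_type_t *t);
        unsigned long max_mapped_pfn;
    };

    /* p2m_init() installs the default page-table backends; on Intel
     * hardware with HAP enabled they are replaced by the EPT versions. */
    void ept_p2m_init(struct domain *d)
    {
        d->arch.p2m->set_entry         = ept_set_entry;
        d->arch.p2m->get_entry         = ept_get_entry;
        d->arch.p2m->get_entry_current = ept_get_entry_current;
    }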

Signed-off-by: Xin Li <xin.b.li@intel.com>
Signed-off-by: Jun Nakajima <jun.nakajima@intel.com>
Signed-off-by: Xiaohui Xin <Xiaohui.xin@intel.com>
Signed-off-by: Keir Fraser <keir.fraser@citrix.com>
21 files changed:
tools/libxc/xc_hvm_build.c
xen/arch/x86/domain.c
xen/arch/x86/hvm/hvm.c
xen/arch/x86/hvm/vmx/vmcs.c
xen/arch/x86/hvm/vmx/vmx.c
xen/arch/x86/mm.c
xen/arch/x86/mm/hap/Makefile
xen/arch/x86/mm/hap/p2m-ept.c [new file with mode: 0644]
xen/arch/x86/mm/p2m.c
xen/arch/x86/mm/paging.c
xen/common/domctl.c
xen/drivers/passthrough/vtd/iommu.c
xen/include/asm-x86/domain.h
xen/include/asm-x86/hvm/domain.h
xen/include/asm-x86/hvm/svm/vmcb.h
xen/include/asm-x86/hvm/vmx/vmcs.h
xen/include/asm-x86/hvm/vmx/vmx.h
xen/include/asm-x86/p2m.h
xen/include/asm-x86/paging.h
xen/include/public/hvm/params.h
xen/include/xen/hypercall.h

diff --git a/tools/libxc/xc_hvm_build.c b/tools/libxc/xc_hvm_build.c
index 296648a436374f6d25e08e6b3e6af16847e0fd24..2dd1d52f011f91856cc55d526b4c35a648447e81 100644
 
 #define SCRATCH_PFN 0xFFFFF
 
+#define SPECIALPAGE_GUARD    0
+#define SPECIALPAGE_BUFIOREQ 1
+#define SPECIALPAGE_XENSTORE 2
+#define SPECIALPAGE_IOREQ    3
+#define SPECIALPAGE_IDENT_PT 4
+#define NR_SPECIAL_PAGES     5
+
 static void build_e820map(void *e820_page, unsigned long long mem_size)
 {
     struct e820entry *e820entry =
@@ -77,21 +84,16 @@ static void build_e820map(void *e820_page, unsigned long long mem_size)
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
 
-    /*
-     * Low RAM goes here. Remove 4 pages for: ioreq, bufioreq, and xenstore.
-     *  1. Guard page.
-     *  2. Buffered ioreq.
-     *  3. Xenstore.
-     *  4. Normal ioreq.
-     */
+    /* Low RAM goes here. Reserve space for special pages. */
     e820entry[nr_map].addr = 0x100000;
-    e820entry[nr_map].size = mem_size - 0x100000 - PAGE_SIZE * 4;
+    e820entry[nr_map].size = (mem_size - 0x100000 -
+                              PAGE_SIZE * NR_SPECIAL_PAGES);
     e820entry[nr_map].type = E820_RAM;
     nr_map++;
 
-    /* Explicitly reserve space for special pages. */
-    e820entry[nr_map].addr = mem_size - PAGE_SIZE * 3;
-    e820entry[nr_map].size = PAGE_SIZE * 3;
+    /* Explicitly reserve space for special pages (excluding guard page). */
+    e820entry[nr_map].addr = mem_size - PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
+    e820entry[nr_map].size = PAGE_SIZE * (NR_SPECIAL_PAGES - 1);
     e820entry[nr_map].type = E820_RESERVED;
     nr_map++;
 
@@ -156,10 +158,11 @@ static int setup_guest(int xc_handle,
 {
     xen_pfn_t *page_array = NULL;
     unsigned long i, nr_pages = (unsigned long)memsize << (20 - PAGE_SHIFT);
-    unsigned long shared_page_nr, entry_eip;
+    unsigned long special_page_nr, entry_eip;
     struct xen_add_to_physmap xatp;
     struct shared_info *shared_info;
     void *e820_page;
+    uint32_t *ident_pt;
     struct elf_binary elf;
     uint64_t v_start, v_end;
     int rc;
@@ -245,29 +248,46 @@ static int setup_guest(int xc_handle,
            sizeof(shared_info->evtchn_mask));
     munmap(shared_info, PAGE_SIZE);
 
-    if ( v_end > HVM_BELOW_4G_RAM_END )
-        shared_page_nr = (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT) - 1;
-    else
-        shared_page_nr = (v_end >> PAGE_SHIFT) - 1;
+    special_page_nr = (((v_end > HVM_BELOW_4G_RAM_END)
+                        ? (HVM_BELOW_4G_RAM_END >> PAGE_SHIFT)
+                        : (v_end >> PAGE_SHIFT))
+                       - NR_SPECIAL_PAGES);
+
+    /* Paranoia: clean special pages. */
+    for ( i = 0; i < NR_SPECIAL_PAGES; i++ )
+        if ( xc_clear_domain_page(xc_handle, dom, special_page_nr + i) )
+            goto error_out;
 
     /* Free the guard page that separates low RAM from special pages. */
     rc = xc_domain_memory_decrease_reservation(
-            xc_handle, dom, 1, 0, &page_array[shared_page_nr-3]);
+        xc_handle, dom, 1, 0, &page_array[special_page_nr]);
     if ( rc != 0 )
     {
         PERROR("Could not deallocate guard page for HVM guest.\n");
         goto error_out;
     }
 
-    /* Paranoia: clean pages. */
-    if ( xc_clear_domain_page(xc_handle, dom, shared_page_nr) ||
-         xc_clear_domain_page(xc_handle, dom, shared_page_nr-1) ||
-         xc_clear_domain_page(xc_handle, dom, shared_page_nr-2) )
-        goto error_out;
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN,
+                     special_page_nr + SPECIALPAGE_XENSTORE);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN,
+                     special_page_nr + SPECIALPAGE_BUFIOREQ);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN,
+                     special_page_nr + SPECIALPAGE_IOREQ);
 
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_STORE_PFN, shared_page_nr-1);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_BUFIOREQ_PFN, shared_page_nr-2);
-    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IOREQ_PFN, shared_page_nr);
+    /*
+     * Identity-map page table is required for running with CR0.PG=0 when
+     * using Intel EPT. Create a 32-bit non-PAE page directory of superpages.
+     */
+    if ( (ident_pt = xc_map_foreign_range(
+              xc_handle, dom, PAGE_SIZE, PROT_READ | PROT_WRITE,
+              special_page_nr + SPECIALPAGE_IDENT_PT)) == NULL )
+        goto error_out;
+    for ( i = 0; i < PAGE_SIZE / sizeof(*ident_pt); i++ )
+        ident_pt[i] = ((i << 22) | _PAGE_PRESENT | _PAGE_RW | _PAGE_USER |
+                       _PAGE_ACCESSED | _PAGE_DIRTY | _PAGE_PSE);
+    munmap(ident_pt, PAGE_SIZE);
+    xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
+                     special_page_nr + SPECIALPAGE_IDENT_PT);
 
     /* Insert JMP <rel32> instruction at address 0x0 to reach entry point. */
     entry_eip = elf_uval(&elf, elf.ehdr, e_entry);
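
For orientation, the resulting layout at the top of the guest's low RAM,
as set up by the hunks above (a summary in comment form, not extra code;
frame numbers are offsets from special_page_nr):

    /*
     * special_page_nr + SPECIALPAGE_GUARD    (0): guard page separating
     *      low RAM from the special pages; freed back to Xen via the
     *      decrease_reservation call above
     * special_page_nr + SPECIALPAGE_BUFIOREQ (1): buffered ioreq page
     * special_page_nr + SPECIALPAGE_XENSTORE (2): xenstore ring page
     * special_page_nr + SPECIALPAGE_IOREQ    (3): synchronous ioreq page
     * special_page_nr + SPECIALPAGE_IDENT_PT (4): 32-bit non-PAE page
     *      directory of 4MB superpages, used as CR3 while the guest runs
     *      with CR0.PG=0 under EPT
     *
     * This is also why the e820 reserved region above covers only
     * NR_SPECIAL_PAGES - 1 pages: the guard page is handed back to Xen
     * and is not part of the guest-visible reservation.
     */
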
diff --git a/xen/arch/x86/domain.c b/xen/arch/x86/domain.c
index 4dd58809a84d2067bfa1be38b710a5375fca705f..ae1097416cd157ce23c875250e072c5fbd4bd1ea 100644
@@ -503,13 +503,15 @@ int arch_domain_create(struct domain *d, unsigned int domcr_flags)
     HYPERVISOR_COMPAT_VIRT_START(d) = __HYPERVISOR_COMPAT_VIRT_START;
 #endif
 
-    paging_domain_init(d);
+    if ( (rc = paging_domain_init(d)) != 0 )
+        goto fail;
     paging_initialised = 1;
 
     if ( !is_idle_domain(d) )
     {
         d->arch.ioport_caps = 
             rangeset_new(d, "I/O Ports", RANGESETF_prettyprint_hex);
+        rc = -ENOMEM;
         if ( d->arch.ioport_caps == NULL )
             goto fail;
 
diff --git a/xen/arch/x86/hvm/hvm.c b/xen/arch/x86/hvm/hvm.c
index c3766bb318df5fc63824b1fd054fe044af68be2c..b0c674c67cbaa562904bd4cc42ebaeb9bb02ca1e 100644
@@ -2212,6 +2212,33 @@ long do_hvm_op(unsigned long op, XEN_GUEST_HANDLE(void) arg)
                 if ( a.value > HVMPTM_one_missed_tick_pending )
                     goto param_fail;
                 break;
+            case HVM_PARAM_IDENT_PT:
+                rc = -EPERM;
+                if ( current->domain->domain_id != 0 )
+                    goto param_fail;
+
+                rc = -EINVAL;
+                if ( d->arch.hvm_domain.params[a.index] != 0 )
+                    goto param_fail;
+
+                if ( !paging_mode_hap(d) )
+                    break;
+
+                domain_pause(d);
+
+                /*
+                 * Update GUEST_CR3 in each VMCS to point at identity map.
+                 * All foreign updates to guest state must synchronise on
+                 * the domctl_lock.
+                 */
+                spin_lock(&domctl_lock);
+                d->arch.hvm_domain.params[a.index] = a.value;
+                for_each_vcpu ( d, v )
+                    paging_update_cr3(v);
+                spin_unlock(&domctl_lock);
+
+                domain_unpause(d);
+                break;
             }
             d->arch.hvm_domain.params[a.index] = a.value;
             rc = 0;
diff --git a/xen/arch/x86/hvm/vmx/vmcs.c b/xen/arch/x86/hvm/vmx/vmcs.c
index 725ae79c21a472d21bf3dc176a7944bfb5287e5d..8543718d0e826017febfc61cb1bf26abb169b052 100644
@@ -84,14 +84,16 @@ static void vmx_init_vmcs_config(void)
 
     min = (CPU_BASED_HLT_EXITING |
            CPU_BASED_INVLPG_EXITING |
+           CPU_BASED_CR3_LOAD_EXITING |
+           CPU_BASED_CR3_STORE_EXITING |
            CPU_BASED_MONITOR_EXITING |
            CPU_BASED_MWAIT_EXITING |
            CPU_BASED_MOV_DR_EXITING |
            CPU_BASED_ACTIVATE_IO_BITMAP |
            CPU_BASED_USE_TSC_OFFSETING);
-    opt  = CPU_BASED_ACTIVATE_MSR_BITMAP;
-    opt |= CPU_BASED_TPR_SHADOW;
-    opt |= CPU_BASED_ACTIVATE_SECONDARY_CONTROLS;
+    opt = (CPU_BASED_ACTIVATE_MSR_BITMAP |
+           CPU_BASED_TPR_SHADOW |
+           CPU_BASED_ACTIVATE_SECONDARY_CONTROLS);
     _vmx_cpu_based_exec_control = adjust_vmx_controls(
         min, opt, MSR_IA32_VMX_PROCBASED_CTLS);
 #ifdef __x86_64__
@@ -107,11 +109,23 @@ static void vmx_init_vmcs_config(void)
     {
         min = 0;
         opt = (SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES |
-               SECONDARY_EXEC_WBINVD_EXITING);
+               SECONDARY_EXEC_WBINVD_EXITING |
+               SECONDARY_EXEC_ENABLE_EPT);
         _vmx_secondary_exec_control = adjust_vmx_controls(
             min, opt, MSR_IA32_VMX_PROCBASED_CTLS2);
     }
 
+    if ( _vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT )
+    {
+        /* To use EPT we expect to be able to clear certain intercepts. */
+        uint32_t must_be_one, must_be_zero;
+        rdmsr(MSR_IA32_VMX_PROCBASED_CTLS, must_be_one, must_be_zero);
+        if ( must_be_one & (CPU_BASED_INVLPG_EXITING |
+                            CPU_BASED_CR3_LOAD_EXITING |
+                            CPU_BASED_CR3_STORE_EXITING) )
+            _vmx_secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+    }
+
 #if defined(__i386__)
     /* If we can't virtualise APIC accesses, the TPR shadow is pointless. */
     if ( !(_vmx_secondary_exec_control &
@@ -301,6 +315,8 @@ int vmx_cpu_up(void)
         return 0;
     }
 
+    ept_sync_all();
+
     return 1;
 }
 
@@ -439,6 +455,7 @@ void vmx_disable_intercept_for_msr(struct vcpu *v, u32 msr)
 
 static int construct_vmcs(struct vcpu *v)
 {
+    struct domain *d = v->domain;
     uint16_t sysenter_cs;
     unsigned long sysenter_eip;
 
@@ -448,10 +465,25 @@ static int construct_vmcs(struct vcpu *v)
     __vmwrite(PIN_BASED_VM_EXEC_CONTROL, vmx_pin_based_exec_control);
     __vmwrite(VM_EXIT_CONTROLS, vmx_vmexit_control);
     __vmwrite(VM_ENTRY_CONTROLS, vmx_vmentry_control);
-    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, vmx_cpu_based_exec_control);
+
     v->arch.hvm_vmx.exec_control = vmx_cpu_based_exec_control;
-    if ( vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS )
-        __vmwrite(SECONDARY_VM_EXEC_CONTROL, vmx_secondary_exec_control);
+    v->arch.hvm_vmx.secondary_exec_control = vmx_secondary_exec_control;
+
+    if ( paging_mode_hap(d) )
+    {
+        v->arch.hvm_vmx.exec_control &= ~(CPU_BASED_INVLPG_EXITING |
+                                          CPU_BASED_CR3_LOAD_EXITING |
+                                          CPU_BASED_CR3_STORE_EXITING);
+    }
+    else
+    {
+        v->arch.hvm_vmx.secondary_exec_control &= ~SECONDARY_EXEC_ENABLE_EPT;
+    }
+
+    __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+    if ( cpu_has_vmx_secondary_exec_control )
+        __vmwrite(SECONDARY_VM_EXEC_CONTROL,
+                  v->arch.hvm_vmx.secondary_exec_control);
 
     /* MSR access bitmap. */
     if ( cpu_has_vmx_msr_bitmap )
@@ -570,9 +602,10 @@ static int construct_vmcs(struct vcpu *v)
     __vmwrite(VMCS_LINK_POINTER_HIGH, ~0UL);
 #endif
 
-    __vmwrite(EXCEPTION_BITMAP, (HVM_TRAP_MASK |
-                                 (1U << TRAP_page_fault) |
-                                 (1U << TRAP_no_device)));
+    __vmwrite(EXCEPTION_BITMAP,
+              HVM_TRAP_MASK
+              | (paging_mode_hap(d) ? 0 : (1U << TRAP_page_fault))
+              | (1U << TRAP_no_device));
 
     v->arch.hvm_vcpu.guest_cr[0] = X86_CR0_PE | X86_CR0_ET;
     hvm_update_guest_cr(v, 0);
@@ -587,6 +620,15 @@ static int construct_vmcs(struct vcpu *v)
         __vmwrite(TPR_THRESHOLD, 0);
     }
 
+    if ( paging_mode_hap(d) )
+    {
+        __vmwrite(EPT_POINTER, d->arch.hvm_domain.vmx.ept_control.eptp);
+#ifdef CONFIG_X86_PAE
+        __vmwrite(EPT_POINTER_HIGH,
+                  d->arch.hvm_domain.vmx.ept_control.eptp >> 32);
+#endif
+    }
+
     vmx_vmcs_exit(v);
 
     paging_update_paging_modes(v); /* will update HOST & GUEST_CR3 as reqd */
@@ -932,6 +974,8 @@ void vmcs_dump_vcpu(struct vcpu *v)
            (uint32_t)vmr(IDT_VECTORING_ERROR_CODE));
     printk("TPR Threshold = 0x%02x\n",
            (uint32_t)vmr(TPR_THRESHOLD));
+    printk("EPT pointer = 0x%08x%08x\n",
+           (uint32_t)vmr(EPT_POINTER_HIGH), (uint32_t)vmr(EPT_POINTER));
 
     vmx_vmcs_exit(v);
 }
diff --git a/xen/arch/x86/hvm/vmx/vmx.c b/xen/arch/x86/hvm/vmx/vmx.c
index 29dcb685037c27c726867fe8bade613e09f3e6a3..a88d11adc676c4cdc9ca40d4cdd6a182084cddb2 100644
@@ -71,11 +71,17 @@ static void vmx_invlpg_intercept(unsigned long vaddr);
 
 static int vmx_domain_initialise(struct domain *d)
 {
+    d->arch.hvm_domain.vmx.ept_control.etmt = EPT_DEFAULT_MT;
+    d->arch.hvm_domain.vmx.ept_control.gaw  = EPT_DEFAULT_GAW;
+    d->arch.hvm_domain.vmx.ept_control.asr  =
+        pagetable_get_pfn(d->arch.phys_table);
+
     return vmx_alloc_vlapic_mapping(d);
 }
 
 static void vmx_domain_destroy(struct domain *d)
 {
+    ept_sync_domain(d);
     vmx_free_vlapic_mapping(d);
 }
 
@@ -492,20 +498,23 @@ static int vmx_restore_cr0_cr3(
     unsigned long mfn = 0;
     p2m_type_t p2mt;
 
-    if ( cr0 & X86_CR0_PG )
+    if ( paging_mode_shadow(v->domain) )
     {
-        mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
-        if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+        if ( cr0 & X86_CR0_PG )
         {
-            gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
-            return -EINVAL;
+            mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+            if ( !p2m_is_ram(p2mt) || !get_page(mfn_to_page(mfn), v->domain) )
+            {
+                gdprintk(XENLOG_ERR, "Invalid CR3 value=0x%lx\n", cr3);
+                return -EINVAL;
+            }
         }
-    }
 
-    if ( v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_PG )
-        put_page(pagetable_get_page(v->arch.guest_table));
+        if ( hvm_paging_enabled(v) )
+            put_page(pagetable_get_page(v->arch.guest_table));
 
-    v->arch.guest_table = pagetable_from_pfn(mfn);
+        v->arch.guest_table = pagetable_from_pfn(mfn);
+    }
 
     v->arch.hvm_vcpu.guest_cr[0] = cr0 | X86_CR0_ET;
     v->arch.hvm_vcpu.guest_cr[3] = cr3;
@@ -900,6 +909,56 @@ static void vmx_set_interrupt_shadow(struct vcpu *v, unsigned int intr_shadow)
     __vmwrite(GUEST_INTERRUPTIBILITY_INFO, intr_shadow);
 }
 
+static void vmx_load_pdptrs(struct vcpu *v)
+{
+    unsigned long cr3 = v->arch.hvm_vcpu.guest_cr[3], mfn;
+    uint64_t *guest_pdptrs;
+    p2m_type_t p2mt;
+    char *p;
+
+    /* EPT needs to load PDPTRS into VMCS for PAE. */
+    if ( !hvm_pae_enabled(v) || (v->arch.hvm_vcpu.guest_efer & EFER_LMA) )
+        return;
+
+    if ( cr3 & 0x1fUL )
+        goto crash;
+
+    mfn = mfn_x(gfn_to_mfn(v->domain, cr3 >> PAGE_SHIFT, &p2mt));
+    if ( !p2m_is_ram(p2mt) )
+        goto crash;
+
+    p = map_domain_page(mfn);
+
+    guest_pdptrs = (uint64_t *)(p + (cr3 & ~PAGE_MASK));
+
+    /*
+     * We do not check the PDPTRs for validity. The CPU will do this during
+     * vm entry, and we can handle the failure there and crash the guest.
+     * The only thing we could do better here is #GP instead.
+     */
+
+    vmx_vmcs_enter(v);
+
+    __vmwrite(GUEST_PDPTR0, guest_pdptrs[0]);
+    __vmwrite(GUEST_PDPTR1, guest_pdptrs[1]);
+    __vmwrite(GUEST_PDPTR2, guest_pdptrs[2]);
+    __vmwrite(GUEST_PDPTR3, guest_pdptrs[3]);
+#ifdef CONFIG_X86_PAE
+    __vmwrite(GUEST_PDPTR0_HIGH, guest_pdptrs[0] >> 32);
+    __vmwrite(GUEST_PDPTR1_HIGH, guest_pdptrs[1] >> 32);
+    __vmwrite(GUEST_PDPTR2_HIGH, guest_pdptrs[2] >> 32);
+    __vmwrite(GUEST_PDPTR3_HIGH, guest_pdptrs[3] >> 32);
+#endif
+
+    vmx_vmcs_exit(v);
+
+    unmap_domain_page(p);
+    return;
+
+ crash:
+    domain_crash(v->domain);
+}
+
 static void vmx_update_host_cr3(struct vcpu *v)
 {
     vmx_vmcs_enter(v);
@@ -915,7 +974,24 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
     {
     case 0: {
         unsigned long hw_cr0_mask =
-            X86_CR0_NE | X86_CR0_PG | X86_CR0_WP | X86_CR0_PE;
+            X86_CR0_NE | X86_CR0_PG | X86_CR0_PE;
+
+        if ( paging_mode_shadow(v->domain) )
+           hw_cr0_mask |= X86_CR0_WP;
+
+        if ( paging_mode_hap(v->domain) )
+        {
+            /* We manage GUEST_CR3 when guest CR0.PE is zero. */
+            uint32_t cr3_ctls = (CPU_BASED_CR3_LOAD_EXITING |
+                                 CPU_BASED_CR3_STORE_EXITING);
+            v->arch.hvm_vmx.exec_control &= ~cr3_ctls;
+            if ( !hvm_paging_enabled(v) )
+                v->arch.hvm_vmx.exec_control |= cr3_ctls;
+            __vmwrite(CPU_BASED_VM_EXEC_CONTROL, v->arch.hvm_vmx.exec_control);
+
+            /* Changing CR0.PE can change some bits in real CR4. */
+            vmx_update_guest_cr(v, 4);
+        }
 
         if ( !(v->arch.hvm_vcpu.guest_cr[0] & X86_CR0_TS) )
         {
@@ -939,11 +1015,26 @@ static void vmx_update_guest_cr(struct vcpu *v, unsigned int cr)
         /* CR2 is updated in exit stub. */
         break;
     case 3:
+        if ( paging_mode_hap(v->domain) )
+        {
+            if ( !hvm_paging_enabled(v) )
+                v->arch.hvm_vcpu.hw_cr[3] =
+                    v->domain->arch.hvm_domain.params[HVM_PARAM_IDENT_PT];
+            vmx_load_pdptrs(v);
+        }
         __vmwrite(GUEST_CR3, v->arch.hvm_vcpu.hw_cr[3]);
         break;
     case 4:
-        v->arch.hvm_vcpu.hw_cr[4] =
-            v->arch.hvm_vcpu.guest_cr[4] | HVM_CR4_HOST_MASK;
+        v->arch.hvm_vcpu.hw_cr[4] = HVM_CR4_HOST_MASK;
+        if ( paging_mode_hap(v->domain) )
+            v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+        v->arch.hvm_vcpu.hw_cr[4] |= v->arch.hvm_vcpu.guest_cr[4];
+        if ( paging_mode_hap(v->domain) && !hvm_paging_enabled(v) )
+        {
+            v->arch.hvm_vcpu.hw_cr[4] |= X86_CR4_PSE;
+            v->arch.hvm_vcpu.hw_cr[4] &= ~X86_CR4_PAE;
+        }
         __vmwrite(GUEST_CR4, v->arch.hvm_vcpu.hw_cr[4]);
         __vmwrite(CR4_READ_SHADOW, v->arch.hvm_vcpu.guest_cr[4]);
         break;
@@ -983,7 +1074,18 @@ static void vmx_flush_guest_tlbs(void)
      * because VMRESUME will flush it for us. */
 }
 
+static void __ept_sync_domain(void *info)
+{
+    struct domain *d = info;
+    __invept(1, d->arch.hvm_domain.vmx.ept_control.eptp, 0);
+}
 
+void ept_sync_domain(struct domain *d)
+{
+    /* Only if using EPT and this domain has some VCPUs to dirty. */
+    if ( d->arch.hvm_domain.hap_enabled && d->vcpu[0] )
+        on_each_cpu(__ept_sync_domain, d, 1, 1);
+}
 
 static void __vmx_inject_exception(
     struct vcpu *v, int trap, int type, int error_code)
@@ -1133,6 +1235,12 @@ void start_vmx(void)
         return;
     }
 
+    if ( cpu_has_vmx_ept )
+    {
+        printk("VMX: EPT is available.\n");
+        vmx_function_table.hap_supported = 1;
+    }
+
     setup_vmcs_dump();
 
     hvm_enable(&vmx_function_table);
@@ -1635,14 +1743,14 @@ static int vmx_alloc_vlapic_mapping(struct domain *d)
     share_xen_page_with_guest(virt_to_page(apic_va), d, XENSHARE_writable);
     set_mmio_p2m_entry(
         d, paddr_to_pfn(APIC_DEFAULT_PHYS_BASE), _mfn(virt_to_mfn(apic_va)));
-    d->arch.hvm_domain.vmx_apic_access_mfn = virt_to_mfn(apic_va);
+    d->arch.hvm_domain.vmx.apic_access_mfn = virt_to_mfn(apic_va);
 
     return 0;
 }
 
 static void vmx_free_vlapic_mapping(struct domain *d)
 {
-    unsigned long mfn = d->arch.hvm_domain.vmx_apic_access_mfn;
+    unsigned long mfn = d->arch.hvm_domain.vmx.apic_access_mfn;
     if ( mfn != 0 )
         free_xenheap_page(mfn_to_virt(mfn));
 }
@@ -1655,7 +1763,7 @@ static void vmx_install_vlapic_mapping(struct vcpu *v)
         return;
 
     virt_page_ma = page_to_maddr(vcpu_vlapic(v)->regs_page);
-    apic_page_ma = v->domain->arch.hvm_domain.vmx_apic_access_mfn;
+    apic_page_ma = v->domain->arch.hvm_domain.vmx.apic_access_mfn;
     apic_page_ma <<= PAGE_SHIFT;
 
     vmx_vmcs_enter(v);
@@ -1900,6 +2008,17 @@ static void vmx_wbinvd_intercept(void)
         wbinvd();
 }
 
+static void ept_handle_violation(unsigned long qualification, paddr_t gpa)
+{
+    if ( unlikely(((qualification >> 7) & 0x3) != 0x3) )
+    {
+        domain_crash(current->domain);
+        return;
+    }
+
+    handle_mmio();
+}
+
 static void vmx_failed_vmentry(unsigned int exit_reason,
                                struct cpu_user_regs *regs)
 {
@@ -1939,6 +2058,10 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
     unsigned long exit_qualification, inst_len = 0;
     struct vcpu *v = current;
 
+    if ( paging_mode_hap(v->domain) && hvm_paging_enabled(v) )
+        v->arch.hvm_vcpu.guest_cr[3] = v->arch.hvm_vcpu.hw_cr[3] =
+            __vmread(GUEST_CR3);
+
     exit_reason = __vmread(VM_EXIT_REASON);
 
     hvmtrace_vmexit(v, regs->eip, exit_reason);
@@ -2171,6 +2294,17 @@ asmlinkage void vmx_vmexit_handler(struct cpu_user_regs *regs)
         break;
     }
 
+    case EXIT_REASON_EPT_VIOLATION:
+    {
+        paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
+#ifdef CONFIG_X86_PAE
+        gpa |= (paddr_t)__vmread(GUEST_PHYSICAL_ADDRESS_HIGH) << 32;
+#endif
+        exit_qualification = __vmread(EXIT_QUALIFICATION);
+        ept_handle_violation(exit_qualification, gpa);
+        break;
+    }
+
     default:
     exit_and_crash:
         gdprintk(XENLOG_ERR, "Bad vmexit (reason %x)\n", exit_reason);
diff --git a/xen/arch/x86/mm.c b/xen/arch/x86/mm.c
index 766b4bf05771c8e18b5307e145b39b9b83a81416..2376e941de5755ef18b9ad62a449871219e2966a 100644
@@ -299,7 +299,7 @@ int memory_is_conventional_ram(paddr_t p)
 unsigned long domain_get_maximum_gpfn(struct domain *d)
 {
     if ( is_hvm_domain(d) )
-        return d->arch.p2m.max_mapped_pfn;
+        return d->arch.p2m->max_mapped_pfn;
     /* NB. PV guests specify nr_pfns rather than max_pfn so we adjust here. */
     return arch_get_max_pfn(d) - 1;
 }
diff --git a/xen/arch/x86/mm/hap/Makefile b/xen/arch/x86/mm/hap/Makefile
index 160e5f36bfbfcc9b15aade0544e9c251cd442a91..64cb72786ef761a01795e3e447be79522c7e40c5 100644
@@ -2,6 +2,7 @@ obj-y += hap.o
 obj-y += guest_walk_2level.o
 obj-y += guest_walk_3level.o
 obj-y += guest_walk_4level.o
+obj-y += p2m-ept.o
 
 guest_levels  = $(subst level,,$(filter %level,$(subst ., ,$(subst _, ,$(1)))))
 guest_walk_defns = -DGUEST_PAGING_LEVELS=$(call guest_levels,$(1))
diff --git a/xen/arch/x86/mm/hap/p2m-ept.c b/xen/arch/x86/mm/hap/p2m-ept.c
new file mode 100644
index 0000000..847643b
--- /dev/null
@@ -0,0 +1,187 @@
+/*
+ * p2m-ept.c: use the EPT page table as the p2m
+ * Copyright (c) 2007, Intel Corporation.
+ *
+ * This program is free software; you can redistribute it and/or modify it
+ * under the terms and conditions of the GNU General Public License,
+ * version 2, as published by the Free Software Foundation.
+ *
+ * This program is distributed in the hope it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE.  See the GNU General Public License for
+ * more details.
+ *
+ * You should have received a copy of the GNU General Public License along with
+ * this program; if not, write to the Free Software Foundation, Inc., 59 Temple
+ * Place - Suite 330, Boston, MA 02111-1307 USA.
+ */
+
+#include <xen/config.h>
+#include <xen/domain_page.h>
+#include <xen/sched.h>
+#include <asm/current.h>
+#include <asm/types.h>
+#include <asm/domain.h>
+#include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h>
+#include <xen/iommu.h>
+
+static int ept_next_level(struct domain *d, bool_t read_only,
+                          ept_entry_t **table, unsigned long *gfn_remainder,
+                          u32 shift)
+{
+    ept_entry_t *ept_entry, *next;
+    u32 index;
+
+    index = *gfn_remainder >> shift;
+    *gfn_remainder &= (1UL << shift) - 1;
+
+    ept_entry = (*table) + index;
+
+    if ( !(ept_entry->epte & 0x7) )
+    {
+        struct page_info *pg;
+
+        if ( read_only )
+            return 0;
+
+        pg = d->arch.p2m->alloc_page(d);
+        if ( pg == NULL )
+            return 0;
+
+        pg->count_info = 1;
+        pg->u.inuse.type_info = 1 | PGT_validated;
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
+
+        ept_entry->emt = 0;
+        ept_entry->sp_avail = 0;
+        ept_entry->avail1 = 0;
+        ept_entry->mfn = page_to_mfn(pg);
+        ept_entry->rsvd = 0;
+        ept_entry->avail2 = 0;
+        /* last step */
+        ept_entry->r = ept_entry->w = ept_entry->x = 1;
+    }
+
+    next = map_domain_page(ept_entry->mfn);
+    unmap_domain_page(*table);
+    *table = next;
+
+    return 1;
+}
+
+static int
+ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{
+    ept_entry_t *table =
+        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry = NULL;
+    u32 index;
+    int i, rv = 0;
+
+    /* Should check if gfn obeys GAW here */
+
+    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+        if ( !ept_next_level(d, 0, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER) )
+            goto out;
+
+    index = gfn_remainder;
+    ept_entry = table + index;
+
+    if ( mfn_valid(mfn_x(mfn)) || (p2mt == p2m_mmio_direct) )
+    {
+        /* Track the highest gfn for which we have ever had a valid mapping */
+        if ( gfn > d->arch.p2m->max_mapped_pfn )
+            d->arch.p2m->max_mapped_pfn = gfn;
+
+        ept_entry->emt = EPT_DEFAULT_MT;
+        ept_entry->sp_avail = 0;
+        ept_entry->avail1 = p2mt;
+        ept_entry->mfn = mfn_x(mfn);
+        ept_entry->rsvd = 0;
+        ept_entry->avail2 = 0;
+        /* last step */
+        ept_entry->r = ept_entry->w = ept_entry->x = 1;
+    }
+    else
+        ept_entry->epte = 0;
+
+    /* Success */
+    rv = 1;
+
+ out:
+    unmap_domain_page(table);
+
+    ept_sync_domain(d);
+
+    /* The p2m table may be shared with the VT-d page table; keep the IOMMU in sync. */
+    if ( iommu_enabled && is_hvm_domain(d) && (p2mt == p2m_mmio_direct) )
+        iommu_flush(d, gfn, (u64*)ept_entry);
+
+    return rv;
+}
+
+/* Read ept p2m entries */
+static mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    ept_entry_t *table =
+        map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
+    unsigned long gfn_remainder = gfn;
+    ept_entry_t *ept_entry;
+    u32 index;
+    int i;
+    mfn_t mfn = _mfn(INVALID_MFN);
+
+    *t = p2m_mmio_dm;
+
+    /* This pfn is higher than the highest the p2m map currently holds */
+    if ( gfn > d->arch.p2m->max_mapped_pfn )
+        goto out;
+
+    /* Should check if gfn obeys GAW here. */
+
+    for ( i = EPT_DEFAULT_GAW; i > 0; i-- )
+        if ( !ept_next_level(d, 1, &table, &gfn_remainder,
+                             i * EPT_TABLE_ORDER) )
+            goto out;
+
+    index = gfn_remainder;
+    ept_entry = table + index;
+
+    if ( (ept_entry->epte & 0x7) == 0x7 )
+    {
+        if ( ept_entry->avail1 != p2m_invalid )
+        {
+            *t = ept_entry->avail1;
+            mfn = _mfn(ept_entry->mfn);
+        }
+    }
+
+ out:
+    unmap_domain_page(table);
+    return mfn;
+}
+
+static mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *t)
+{
+    return ept_get_entry(current->domain, gfn, t);
+}
+
+void ept_p2m_init(struct domain *d)
+{
+    d->arch.p2m->set_entry = ept_set_entry;
+    d->arch.p2m->get_entry = ept_get_entry;
+    d->arch.p2m->get_entry_current = ept_get_entry_current;
+}
+
+/*
+ * Local variables:
+ * mode: C
+ * c-set-style: "BSD"
+ * c-basic-offset: 4
+ * tab-width: 4
+ * indent-tabs-mode: nil
+ * End:
+ */
diff --git a/xen/arch/x86/mm/p2m.c b/xen/arch/x86/mm/p2m.c
index e8298fb3bd0079d10250d413ca42aa4deeac46ff..ea7fd4506f1d5eb6ed87e8537310b4d1448d428e 100644
@@ -27,6 +27,7 @@
 #include <asm/page.h>
 #include <asm/paging.h>
 #include <asm/p2m.h>
+#include <asm/hvm/vmx/vmx.h> /* ept_p2m_init() */
 #include <xen/iommu.h>
 
 /* Debugging and auditing of the P2M code? */
  * Locking discipline: always acquire this lock before the shadow or HAP one
  */
 
-#define p2m_lock_init(_d)                            \
-    do {                                             \
-        spin_lock_init(&(_d)->arch.p2m.lock);        \
-        (_d)->arch.p2m.locker = -1;                  \
-        (_d)->arch.p2m.locker_function = "nobody";   \
+#define p2m_lock_init(_p2m)                     \
+    do {                                        \
+        spin_lock_init(&(_p2m)->lock);          \
+        (_p2m)->locker = -1;                    \
+        (_p2m)->locker_function = "nobody";     \
     } while (0)
 
-#define p2m_lock(_d)                                                \
-    do {                                                            \
-        if ( unlikely((_d)->arch.p2m.locker == current->processor) )\
-        {                                                           \
-            printk("Error: p2m lock held by %s\n",                  \
-                   (_d)->arch.p2m.locker_function);                 \
-            BUG();                                                  \
-        }                                                           \
-        spin_lock(&(_d)->arch.p2m.lock);                            \
-        ASSERT((_d)->arch.p2m.locker == -1);                        \
-        (_d)->arch.p2m.locker = current->processor;                 \
-        (_d)->arch.p2m.locker_function = __func__;                  \
+#define p2m_lock(_p2m)                                          \
+    do {                                                        \
+        if ( unlikely((_p2m)->locker == current->processor) )   \
+        {                                                       \
+            printk("Error: p2m lock held by %s\n",              \
+                   (_p2m)->locker_function);                    \
+            BUG();                                              \
+        }                                                       \
+        spin_lock(&(_p2m)->lock);                               \
+        ASSERT((_p2m)->locker == -1);                           \
+        (_p2m)->locker = current->processor;                    \
+        (_p2m)->locker_function = __func__;                     \
     } while (0)
 
-#define p2m_unlock(_d)                                              \
-    do {                                                            \
-        ASSERT((_d)->arch.p2m.locker == current->processor); \
-        (_d)->arch.p2m.locker = -1;                          \
-        (_d)->arch.p2m.locker_function = "nobody";           \
-        spin_unlock(&(_d)->arch.p2m.lock);                   \
+#define p2m_unlock(_p2m)                                \
+    do {                                                \
+        ASSERT((_p2m)->locker == current->processor);   \
+        (_p2m)->locker = -1;                            \
+        (_p2m)->locker_function = "nobody";             \
+        spin_unlock(&(_p2m)->lock);                     \
     } while (0)
 
 
-
 /* Printouts */
 #define P2M_PRINTK(_f, _a...)                                \
     debugtrace_printk("p2m: %s(): " _f, __func__, ##_a)
@@ -152,7 +152,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
     l1_pgentry_t *p2m_entry;
     l1_pgentry_t new_entry;
     void *next;
-    ASSERT(d->arch.p2m.alloc_page);
+    ASSERT(d->arch.p2m->alloc_page);
 
     if ( !(p2m_entry = p2m_find_entry(*table, gfn_remainder, gfn,
                                       shift, max)) )
@@ -160,10 +160,10 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
 
     if ( !(l1e_get_flags(*p2m_entry) & _PAGE_PRESENT) )
     {
-        struct page_info *pg = d->arch.p2m.alloc_page(d);
+        struct page_info *pg = d->arch.p2m->alloc_page(d);
         if ( pg == NULL )
             return 0;
-        list_add_tail(&pg->list, &d->arch.p2m.pages);
+        list_add_tail(&pg->list, &d->arch.p2m->pages);
         pg->u.inuse.type_info = type | 1 | PGT_validated;
         pg->count_info = 1;
 
@@ -202,7 +202,7 @@ p2m_next_level(struct domain *d, mfn_t *table_mfn, void **table,
 
 // Returns 0 on error (out of memory)
 static int
-set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+p2m_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
 {
     // XXX -- this might be able to be faster iff current->domain == d
     mfn_t table_mfn = pagetable_get_mfn(d->arch.phys_table);
@@ -244,8 +244,8 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
     ASSERT(p2m_entry);
 
     /* Track the highest gfn for which we have ever had a valid mapping */
-    if ( mfn_valid(mfn) && (gfn > d->arch.p2m.max_mapped_pfn) )
-        d->arch.p2m.max_mapped_pfn = gfn;
+    if ( mfn_valid(mfn) && (gfn > d->arch.p2m->max_mapped_pfn) )
+        d->arch.p2m->max_mapped_pfn = gfn;
 
     if ( mfn_valid(mfn) || (p2mt == p2m_mmio_direct) )
         entry_content = l1e_from_pfn(mfn_x(mfn), p2m_type_to_flags(p2mt));
@@ -279,14 +279,158 @@ set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
     return rv;
 }
 
+static mfn_t
+p2m_gfn_to_mfn(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    mfn_t mfn;
+    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
+    l2_pgentry_t *l2e;
+    l1_pgentry_t *l1e;
+
+    ASSERT(paging_mode_translate(d));
+
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+    *t = p2m_mmio_dm;
+
+    mfn = pagetable_get_mfn(d->arch.phys_table);
+
+    if ( gfn > d->arch.p2m->max_mapped_pfn )
+        /* This pfn is higher than the highest the p2m map currently holds */
+        return _mfn(INVALID_MFN);
+
+#if CONFIG_PAGING_LEVELS >= 4
+    {
+        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
+        l4e += l4_table_offset(addr);
+        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l4e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l4e_get_pfn(*l4e));
+        unmap_domain_page(l4e);
+    }
+#endif
+#if CONFIG_PAGING_LEVELS >= 3
+    {
+        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
+#if CONFIG_PAGING_LEVELS == 3
+        /* On PAE hosts the p2m has eight l3 entries, not four (see
+         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
+         * Instead, just count the number of l3es from zero.  It's safe
+         * to do this because we already checked that the gfn is within
+         * the bounds of the p2m. */
+        l3e += (addr >> L3_PAGETABLE_SHIFT);
+#else
+        l3e += l3_table_offset(addr);
+#endif
+        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
+        {
+            unmap_domain_page(l3e);
+            return _mfn(INVALID_MFN);
+        }
+        mfn = _mfn(l3e_get_pfn(*l3e));
+        unmap_domain_page(l3e);
+    }
+#endif
+
+    l2e = map_domain_page(mfn_x(mfn));
+    l2e += l2_table_offset(addr);
+    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
+    {
+        unmap_domain_page(l2e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l2e_get_pfn(*l2e));
+    unmap_domain_page(l2e);
+
+    l1e = map_domain_page(mfn_x(mfn));
+    l1e += l1_table_offset(addr);
+    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
+    {
+        unmap_domain_page(l1e);
+        return _mfn(INVALID_MFN);
+    }
+    mfn = _mfn(l1e_get_pfn(*l1e));
+    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
+    unmap_domain_page(l1e);
+
+    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
+    return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+}
+
+/* Read the current domain's p2m table (through the linear mapping). */
+static mfn_t p2m_gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
+{
+    mfn_t mfn = _mfn(INVALID_MFN);
+    p2m_type_t p2mt = p2m_mmio_dm;
+    /* XXX This is for compatibility with the old model, where anything not 
+     * XXX marked as RAM was considered to be emulated MMIO space.
+     * XXX Once we start explicitly registering MMIO regions in the p2m 
+     * XXX we will return p2m_invalid for unmapped gfns */
+
+    if ( gfn <= current->domain->arch.p2m->max_mapped_pfn )
+    {
+        l1_pgentry_t l1e = l1e_empty();
+        int ret;
+
+        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
+               / sizeof(l1_pgentry_t));
+
+        /* Need to __copy_from_user because the p2m is sparse and this
+         * part might not exist */
+        ret = __copy_from_user(&l1e,
+                               &phys_to_machine_mapping[gfn],
+                               sizeof(l1e));
+
+        if ( ret == 0 ) {
+            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
+            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
+            if ( p2m_is_valid(p2mt) )
+                mfn = _mfn(l1e_get_pfn(l1e));
+            else 
+                /* XXX see above */
+                p2mt = p2m_mmio_dm;
+        }
+    }
+
+    *t = p2mt;
+    return mfn;
+}
 
 /* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d)
+int p2m_init(struct domain *d)
 {
-    p2m_lock_init(d);
-    INIT_LIST_HEAD(&d->arch.p2m.pages);
+    struct p2m_domain *p2m;
+
+    p2m = xmalloc(struct p2m_domain);
+    if ( p2m == NULL )
+        return -ENOMEM;
+
+    d->arch.p2m = p2m;
+
+    p2m_lock_init(p2m);
+    INIT_LIST_HEAD(&p2m->pages);
+
+    p2m->set_entry = p2m_set_entry;
+    p2m->get_entry = p2m_gfn_to_mfn;
+    p2m->get_entry_current = p2m_gfn_to_mfn_current;
+
+    if ( is_hvm_domain(d) && d->arch.hvm_domain.hap_enabled &&
+         (boot_cpu_data.x86_vendor == X86_VENDOR_INTEL) )
+        ept_p2m_init(d);
+
+    return 0;
 }
 
+static inline
+int set_p2m_entry(struct domain *d, unsigned long gfn, mfn_t mfn, p2m_type_t p2mt)
+{
+    return d->arch.p2m->set_entry(d, gfn, mfn, p2mt);
+}
 
 // Allocate a new p2m table for a domain.
 //
@@ -308,28 +452,29 @@ int p2m_alloc_table(struct domain *d,
     struct page_info *page, *p2m_top;
     unsigned int page_count = 0;
     unsigned long gfn = -1UL;
+    struct p2m_domain *p2m = d->arch.p2m;
 
-    p2m_lock(d);
+    p2m_lock(p2m);
 
     if ( pagetable_get_pfn(d->arch.phys_table) != 0 )
     {
         P2M_ERROR("p2m already allocated for this domain\n");
-        p2m_unlock(d);
+        p2m_unlock(p2m);
         return -EINVAL;
     }
 
     P2M_PRINTK("allocating p2m table\n");
 
-    d->arch.p2m.alloc_page = alloc_page;
-    d->arch.p2m.free_page = free_page;
+    p2m->alloc_page = alloc_page;
+    p2m->free_page = free_page;
 
-    p2m_top = d->arch.p2m.alloc_page(d);
+    p2m_top = p2m->alloc_page(d);
     if ( p2m_top == NULL )
     {
-        p2m_unlock(d);
+        p2m_unlock(p2m);
         return -ENOMEM;
     }
-    list_add_tail(&p2m_top->list, &d->arch.p2m.pages);
+    list_add_tail(&p2m_top->list, &p2m->pages);
 
     p2m_top->count_info = 1;
     p2m_top->u.inuse.type_info =
@@ -376,13 +521,13 @@ int p2m_alloc_table(struct domain *d,
 #endif
 
     P2M_PRINTK("p2m table initialised (%u pages)\n", page_count);
-    p2m_unlock(d);
+    p2m_unlock(p2m);
     return 0;
 
  error:
     P2M_PRINTK("failed to initialize p2m table, gfn=%05lx, mfn=%"
                PRI_mfn "\n", gfn, mfn_x(mfn));
-    p2m_unlock(d);
+    p2m_unlock(p2m);
     return -ENOMEM;
 }
 
@@ -392,101 +537,24 @@ void p2m_teardown(struct domain *d)
 {
     struct list_head *entry, *n;
     struct page_info *pg;
+    struct p2m_domain *p2m = d->arch.p2m;
 
-    p2m_lock(d);
+    p2m_lock(p2m);
     d->arch.phys_table = pagetable_null();
 
-    list_for_each_safe(entry, n, &d->arch.p2m.pages)
+    list_for_each_safe(entry, n, &p2m->pages)
     {
         pg = list_entry(entry, struct page_info, list);
         list_del(entry);
-        d->arch.p2m.free_page(d, pg);
+        p2m->free_page(d, pg);
     }
-    p2m_unlock(d);
+    p2m_unlock(p2m);
 }
 
-mfn_t
-gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
-/* Read another domain's p2m entries */
+void p2m_final_teardown(struct domain *d)
 {
-    mfn_t mfn;
-    paddr_t addr = ((paddr_t)gfn) << PAGE_SHIFT;
-    l2_pgentry_t *l2e;
-    l1_pgentry_t *l1e;
-
-    ASSERT(paging_mode_translate(d));
-
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-    *t = p2m_mmio_dm;
-
-    mfn = pagetable_get_mfn(d->arch.phys_table);
-
-    if ( gfn > d->arch.p2m.max_mapped_pfn )
-        /* This pfn is higher than the highest the p2m map currently holds */
-        return _mfn(INVALID_MFN);
-
-#if CONFIG_PAGING_LEVELS >= 4
-    {
-        l4_pgentry_t *l4e = map_domain_page(mfn_x(mfn));
-        l4e += l4_table_offset(addr);
-        if ( (l4e_get_flags(*l4e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l4e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l4e_get_pfn(*l4e));
-        unmap_domain_page(l4e);
-    }
-#endif
-#if CONFIG_PAGING_LEVELS >= 3
-    {
-        l3_pgentry_t *l3e = map_domain_page(mfn_x(mfn));
-#if CONFIG_PAGING_LEVELS == 3
-        /* On PAE hosts the p2m has eight l3 entries, not four (see
-         * shadow_set_p2m_entry()) so we can't use l3_table_offset.
-         * Instead, just count the number of l3es from zero.  It's safe
-         * to do this because we already checked that the gfn is within
-         * the bounds of the p2m. */
-        l3e += (addr >> L3_PAGETABLE_SHIFT);
-#else
-        l3e += l3_table_offset(addr);
-#endif
-        if ( (l3e_get_flags(*l3e) & _PAGE_PRESENT) == 0 )
-        {
-            unmap_domain_page(l3e);
-            return _mfn(INVALID_MFN);
-        }
-        mfn = _mfn(l3e_get_pfn(*l3e));
-        unmap_domain_page(l3e);
-    }
-#endif
-
-    l2e = map_domain_page(mfn_x(mfn));
-    l2e += l2_table_offset(addr);
-    if ( (l2e_get_flags(*l2e) & _PAGE_PRESENT) == 0 )
-    {
-        unmap_domain_page(l2e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l2e_get_pfn(*l2e));
-    unmap_domain_page(l2e);
-
-    l1e = map_domain_page(mfn_x(mfn));
-    l1e += l1_table_offset(addr);
-    if ( (l1e_get_flags(*l1e) & _PAGE_PRESENT) == 0 )
-    {
-        unmap_domain_page(l1e);
-        return _mfn(INVALID_MFN);
-    }
-    mfn = _mfn(l1e_get_pfn(*l1e));
-    *t = p2m_flags_to_type(l1e_get_flags(*l1e));
-    unmap_domain_page(l1e);
-
-    ASSERT(mfn_valid(mfn) || !p2m_is_ram(*t));
-    return (p2m_is_valid(*t)) ? mfn : _mfn(INVALID_MFN);
+    xfree(d->arch.p2m);
+    d->arch.p2m = NULL;
 }
 
 #if P2M_AUDIT
@@ -564,7 +632,7 @@ static void audit_p2m(struct domain *d)
             set_gpfn_from_mfn(mfn, INVALID_M2P_ENTRY);
         }
 
-        if ( test_linear && (gfn <= d->arch.p2m.max_mapped_pfn) )
+        if ( test_linear && (gfn <= d->arch.p2m->max_mapped_pfn) )
         {
             lp2mfn = mfn_x(gfn_to_mfn_current(gfn, &type));
             if ( lp2mfn != mfn_x(p2mfn) )
@@ -695,11 +763,11 @@ void
 guest_physmap_remove_page(struct domain *d, unsigned long gfn,
                           unsigned long mfn)
 {
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
     audit_p2m(d);
     p2m_remove_page(d, gfn, mfn);
     audit_p2m(d);
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 }
 
 int
@@ -722,7 +790,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
      */
     if ( paging_mode_hap(d) && (gfn > 0xfffffUL) )
     {
-        if ( !test_and_set_bool(d->arch.hvm_domain.amd_npt_4gb_warning) )
+        if ( !test_and_set_bool(d->arch.hvm_domain.svm.npt_4gb_warning) )
             dprintk(XENLOG_WARNING, "Dom%d failed to populate memory beyond"
                     " 4GB: specify 'hap=0' domain config option.\n",
                     d->domain_id);
@@ -730,7 +798,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
     }
 #endif
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
     audit_p2m(d);
 
     P2M_DEBUG("adding gfn=%#lx mfn=%#lx\n", gfn, mfn);
@@ -781,7 +849,7 @@ guest_physmap_add_entry(struct domain *d, unsigned long gfn,
     }
 
     audit_p2m(d);
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 
     return rc;
 }
@@ -812,7 +880,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt)
     if ( pagetable_get_pfn(d->arch.phys_table) == 0 )
         return;
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
 
 #if CONFIG_PAGING_LEVELS == 4
     l4e = map_domain_page(mfn_x(pagetable_get_mfn(d->arch.phys_table)));
@@ -884,7 +952,7 @@ void p2m_change_type_global(struct domain *d, p2m_type_t ot, p2m_type_t nt)
     unmap_domain_page(l2e);
 #endif
 
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 }
 
 /* Modify the p2m type of a single gfn from ot to nt, returning the 
@@ -895,13 +963,13 @@ p2m_type_t p2m_change_type(struct domain *d, unsigned long gfn,
     p2m_type_t pt;
     mfn_t mfn;
 
-    p2m_lock(d);
+    p2m_lock(d->arch.p2m);
 
     mfn = gfn_to_mfn(d, gfn, &pt);
     if ( pt == ot )
         set_p2m_entry(d, gfn, mfn, nt);
 
-    p2m_unlock(d);
+    p2m_unlock(d->arch.p2m);
 
     return pt;
 }
diff --git a/xen/arch/x86/mm/paging.c b/xen/arch/x86/mm/paging.c
index f310c0facdd78ebd8d2e133f6ef3ea05d24a31dc..2247d8dd68752a1236c7d13618776db55236becc 100644
@@ -484,9 +484,12 @@ void paging_log_dirty_teardown(struct domain*d)
 /*           CODE FOR PAGING SUPPORT            */
 /************************************************/
 /* Domain paging struct initialization. */
-void paging_domain_init(struct domain *d)
+int paging_domain_init(struct domain *d)
 {
-    p2m_init(d);
+    int rc;
+
+    if ( (rc = p2m_init(d)) != 0 )
+        return rc;
 
     /* The order of the *_init calls below is important, as the later
      * ones may rewrite some common fields.  Shadow pagetables are the
@@ -496,6 +499,8 @@ void paging_domain_init(struct domain *d)
     /* ... but we will use hardware assistance if it's available. */
     if ( hap_enabled(d) )
         hap_domain_init(d);
+
+    return 0;
 }
 
 /* vcpu paging struct initialization goes here */
@@ -589,6 +594,8 @@ void paging_final_teardown(struct domain *d)
         hap_final_teardown(d);
     else
         shadow_final_teardown(d);
+
+    p2m_final_teardown(d);
 }
 
 /* Enable an arbitrary paging-assistance mode.  Call once at domain
diff --git a/xen/common/domctl.c b/xen/common/domctl.c
index 52143dbd1db5b715d5e84b22d79064368ba0af90..e1c6e87ccfdf1be94bafce21e4507e8a3a3b1d11 100644
@@ -25,6 +25,8 @@
 #include <public/domctl.h>
 #include <xsm/xsm.h>
 
+DEFINE_SPINLOCK(domctl_lock);
+
 extern long arch_do_domctl(
     struct xen_domctl *op, XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
 
@@ -180,7 +182,6 @@ long do_domctl(XEN_GUEST_HANDLE(xen_domctl_t) u_domctl)
 {
     long ret = 0;
     struct xen_domctl curop, *op = &curop;
-    static DEFINE_SPINLOCK(domctl_lock);
 
     if ( !IS_PRIV(current->domain) )
         return -EPERM;
diff --git a/xen/drivers/passthrough/vtd/iommu.c b/xen/drivers/passthrough/vtd/iommu.c
index b999b1eef2a0c3d93bc274f8403f1f8c2f7ffe42..aa27841b59daedc13ac3c8df795cd0d5e0ca73a3 100644
@@ -23,6 +23,7 @@
 #include <xen/sched.h>
 #include <xen/xmalloc.h>
 #include <xen/domain_page.h>
+#include <asm/paging.h>
 #include <xen/iommu.h>
 #include <xen/numa.h>
 #include "iommu.h"
@@ -2057,9 +2058,42 @@ void iommu_set_pgd(struct domain *d)
     }
     p2m_table = mfn_x(pagetable_get_mfn(d->arch.phys_table));
 
-#if CONFIG_PAGING_LEVELS == 3
-    if ( !hd->pgd )
+    if ( paging_mode_hap(d) )
     {
+        int level = agaw_to_level(hd->agaw);
+        struct dma_pte *dpte = NULL;
+        mfn_t pgd_mfn;
+
+        switch ( level )
+        {
+        case VTD_PAGE_TABLE_LEVEL_3:
+            dpte = map_domain_page(p2m_table);
+            if ( !dma_pte_present(*dpte) )
+            {
+                gdprintk(XENLOG_ERR VTDPREFIX,
+                         "iommu_set_pgd: second level wasn't there\n");
+                unmap_domain_page(dpte);
+                return;
+            }
+            pgd_mfn = _mfn(dma_pte_addr(*dpte) >> PAGE_SHIFT_4K);
+            unmap_domain_page(dpte);
+            hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                pagetable_from_mfn(pgd_mfn)));
+            break;
+        case VTD_PAGE_TABLE_LEVEL_4:
+            pgd_mfn = _mfn(p2m_table);
+            hd->pgd = maddr_to_virt(pagetable_get_paddr(
+                pagetable_from_mfn(pgd_mfn)));
+            break;
+        default:
+            gdprintk(XENLOG_ERR VTDPREFIX,
+                     "iommu_set_pgd:Unsupported p2m table sharing level!\n");
+            break;
+        }
+    }
+    else
+    {
+#if CONFIG_PAGING_LEVELS == 3
         int level = agaw_to_level(hd->agaw);
         struct dma_pte *pmd = NULL;
         struct dma_pte *pgd = NULL;
@@ -2125,10 +2159,7 @@ void iommu_set_pgd(struct domain *d)
         }
         unmap_domain_page(l3e);
         spin_unlock_irqrestore(&hd->mapping_lock, flags);
-    }
 #elif CONFIG_PAGING_LEVELS == 4
-    if ( !hd->pgd )
-    {
         int level = agaw_to_level(hd->agaw);
         l3_pgentry_t *l3e;
         mfn_t pgd_mfn;
@@ -2160,8 +2191,8 @@ void iommu_set_pgd(struct domain *d)
                      "iommu_set_pgd:Unsupported p2m table sharing level!\n");
             break;
         }
-    }
 #endif
+    }
     gdprintk(XENLOG_INFO VTDPREFIX,
              "iommu_set_pgd: hd->pgd = %p\n", hd->pgd);
 }
diff --git a/xen/include/asm-x86/domain.h b/xen/include/asm-x86/domain.h
index 42b8da7b36386f43601e430199230f2762ae8958..dad9f9723d013b3f725069dbbec9d55850b9d681 100644
@@ -137,27 +137,6 @@ struct hap_domain {
     unsigned int      p2m_pages;    /* number of pages allocates to p2m */
 };
 
-/************************************************/
-/*       p2m handling                           */
-/************************************************/
-struct p2m_domain {
-    /* Lock that protects updates to the p2m */
-    spinlock_t         lock;
-    int                locker;   /* processor which holds the lock */
-    const char        *locker_function; /* Func that took it */
-
-    /* Pages used to construct the p2m */
-    struct list_head   pages;
-
-    /* Functions to call to get or free pages for the p2m */
-    struct page_info * (*alloc_page  )(struct domain *d);
-    void               (*free_page   )(struct domain *d,
-                                       struct page_info *pg);
-
-    /* Highest guest frame that's ever been mapped in the p2m */
-    unsigned long max_mapped_pfn;
-};
-
 /************************************************/
 /*       common paging data structure           */
 /************************************************/
@@ -208,6 +187,8 @@ struct paging_vcpu {
     struct shadow_vcpu shadow;
 };
 
+struct p2m_domain;
+
 struct arch_domain
 {
     l1_pgentry_t *mm_perdomain_pt;
@@ -232,7 +213,7 @@ struct arch_domain
     struct hvm_domain hvm_domain;
 
     struct paging_domain paging;
-    struct p2m_domain p2m ;
+    struct p2m_domain *p2m;
 
     /* Shadow translated domain: P2M mapping */
     pagetable_t phys_table;
diff --git a/xen/include/asm-x86/hvm/domain.h b/xen/include/asm-x86/hvm/domain.h
index 13618065746b6fa2630a1341cfe8bbace2ad8a02..0c23c7d9491780d08aa695f0ea5ac1c49b2a4a02 100644
@@ -28,6 +28,8 @@
 #include <asm/hvm/vioapic.h>
 #include <asm/hvm/io.h>
 #include <xen/hvm/iommu.h>
+#include <asm/hvm/vmx/vmcs.h>
+#include <asm/hvm/svm/vmcb.h>
 #include <public/hvm/params.h>
 #include <public/hvm/save.h>
 
@@ -60,8 +62,6 @@ struct hvm_domain {
 
     uint64_t               params[HVM_NR_PARAMS];
 
-    unsigned long          vmx_apic_access_mfn;
-
     /* Memory ranges with pinned cache attributes. */
     struct list_head       pinned_cacheattr_ranges;
 
@@ -74,11 +74,13 @@ struct hvm_domain {
     /* Pass-through */
     struct hvm_iommu       hvm_iommu;
 
-#if CONFIG_PAGING_LEVELS == 3
-    bool_t                 amd_npt_4gb_warning;
-#endif
     bool_t                 hap_enabled;
     bool_t                 qemu_mapcache_invalidate;
+
+    union {
+        struct vmx_domain vmx;
+        struct svm_domain svm;
+    };
 };
 
 #endif /* __ASM_X86_HVM_DOMAIN_H__ */
diff --git a/xen/include/asm-x86/hvm/svm/vmcb.h b/xen/include/asm-x86/hvm/svm/vmcb.h
index 04fd5e0f12e2ae8a04040e028753ef07a1f1e988..55e98afabdbc642b37940c6b9891e411d5ec043c 100644
@@ -444,6 +444,12 @@ struct vmcb_struct {
     u64 res16[301];
 } __attribute__ ((packed));
 
+struct svm_domain {
+#if CONFIG_PAGING_LEVELS == 3
+    bool_t npt_4gb_warning;
+#endif
+};
+
 struct arch_svm_struct {
     struct vmcb_struct *vmcb;
     u64    vmcb_pa;
diff --git a/xen/include/asm-x86/hvm/vmx/vmcs.h b/xen/include/asm-x86/hvm/vmx/vmcs.h
index 9ce2d2a38b6bfac503ea6b7d5595b5dd1b195a64..540cad74d48b72e6aa83974c40cba4c37bebcbd4 100644
@@ -53,6 +53,23 @@ struct vmx_msr_state {
     unsigned long msrs[VMX_MSR_COUNT];
 };
 
+#define EPT_DEFAULT_MT      6
+#define EPT_DEFAULT_GAW     3
+
+struct vmx_domain {
+    unsigned long apic_access_mfn;
+
+    union {
+        struct {
+            u64 etmt :3,
+                gaw  :3,
+                rsvd :6,
+                asr  :52;
+        };
+        u64 eptp;
+    } ept_control;
+};
+
 struct arch_vmx_struct {
     /* Virtual address of VMCS. */
     struct vmcs_struct  *vmcs;
@@ -71,6 +88,7 @@ struct arch_vmx_struct {
 
     /* Cache of cpu execution control. */
     u32                  exec_control;
+    u32                  secondary_exec_control;
 
     /* PMU */
     struct vpmu_struct   vpmu;
@@ -108,6 +126,8 @@ void vmx_vmcs_exit(struct vcpu *v);
 #define CPU_BASED_MWAIT_EXITING               0x00000400
 #define CPU_BASED_RDPMC_EXITING               0x00000800
 #define CPU_BASED_RDTSC_EXITING               0x00001000
+#define CPU_BASED_CR3_LOAD_EXITING            0x00008000
+#define CPU_BASED_CR3_STORE_EXITING           0x00010000
 #define CPU_BASED_CR8_LOAD_EXITING            0x00080000
 #define CPU_BASED_CR8_STORE_EXITING           0x00100000
 #define CPU_BASED_TPR_SHADOW                  0x00200000
@@ -136,6 +156,7 @@ extern u32 vmx_vmexit_control;
 extern u32 vmx_vmentry_control;
 
 #define SECONDARY_EXEC_VIRTUALIZE_APIC_ACCESSES 0x00000001
+#define SECONDARY_EXEC_ENABLE_EPT               0x00000002
 #define SECONDARY_EXEC_WBINVD_EXITING           0x00000040
 extern u32 vmx_secondary_exec_control;
 
@@ -151,6 +172,10 @@ extern bool_t cpu_has_vmx_ins_outs_instr_info;
     (vmx_pin_based_exec_control & PIN_BASED_VIRTUAL_NMIS)
 #define cpu_has_vmx_msr_bitmap \
     (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_MSR_BITMAP)
+#define cpu_has_vmx_secondary_exec_control \
+    (vmx_cpu_based_exec_control & CPU_BASED_ACTIVATE_SECONDARY_CONTROLS)
+#define cpu_has_vmx_ept \
+    (vmx_secondary_exec_control & SECONDARY_EXEC_ENABLE_EPT)
 
 /* GUEST_INTERRUPTIBILITY_INFO flags. */
 #define VMX_INTR_SHADOW_STI             0x00000001
@@ -192,11 +217,23 @@ enum vmcs_field {
     VIRTUAL_APIC_PAGE_ADDR          = 0x00002012,
     VIRTUAL_APIC_PAGE_ADDR_HIGH     = 0x00002013,
     APIC_ACCESS_ADDR                = 0x00002014,
-    APIC_ACCESS_ADDR_HIGH           = 0x00002015, 
+    APIC_ACCESS_ADDR_HIGH           = 0x00002015,
+    EPT_POINTER                     = 0x0000201a,
+    EPT_POINTER_HIGH                = 0x0000201b,
+    GUEST_PHYSICAL_ADDRESS          = 0x00002400,
+    GUEST_PHYSICAL_ADDRESS_HIGH     = 0x00002401,
     VMCS_LINK_POINTER               = 0x00002800,
     VMCS_LINK_POINTER_HIGH          = 0x00002801,
     GUEST_IA32_DEBUGCTL             = 0x00002802,
     GUEST_IA32_DEBUGCTL_HIGH        = 0x00002803,
+    GUEST_PDPTR0                    = 0x0000280a,
+    GUEST_PDPTR0_HIGH               = 0x0000280b,
+    GUEST_PDPTR1                    = 0x0000280c,
+    GUEST_PDPTR1_HIGH               = 0x0000280d,
+    GUEST_PDPTR2                    = 0x0000280e,
+    GUEST_PDPTR2_HIGH               = 0x0000280f,
+    GUEST_PDPTR3                    = 0x00002810,
+    GUEST_PDPTR3_HIGH               = 0x00002811,
     PIN_BASED_VM_EXEC_CONTROL       = 0x00004000,
     CPU_BASED_VM_EXEC_CONTROL       = 0x00004002,
     EXCEPTION_BITMAP                = 0x00004004,
diff --git a/xen/include/asm-x86/hvm/vmx/vmx.h b/xen/include/asm-x86/hvm/vmx/vmx.h
index 1d2f37d4d1e3acf0fc91b0c26420053059f16980..a873c343a1fe70b029db00890c18642aae9b374e 100644
 #include <asm/types.h>
 #include <asm/regs.h>
 #include <asm/processor.h>
-#include <asm/hvm/vmx/vmcs.h>
 #include <asm/i387.h>
+#include <asm/hvm/support.h>
 #include <asm/hvm/trace.h>
+#include <asm/hvm/vmx/vmcs.h>
+
+typedef union {
+    struct {
+        u64 r       :   1,
+        w           :   1,
+        x           :   1,
+        emt         :   4,
+        sp_avail    :   1,
+        avail1      :   4,
+        mfn         :   45,
+        rsvd        :   5,
+        avail2      :   2;
+    };
+    u64 epte;
+} ept_entry_t;
+
+#define EPT_TABLE_ORDER     9
 
 void vmx_asm_vmexit_handler(struct cpu_user_regs);
 void vmx_asm_do_vmentry(void);
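
The ept_entry_t union mirrors the hardware EPT entry layout: read/write/execute permission bits, a 4-bit memory-type field, a superpage indicator, and the machine frame number, with EPT_TABLE_ORDER giving 512 entries per 4K table level. A minimal sketch of building a leaf entry for ordinary RAM follows; the helper name and the write-back memory-type value are assumptions rather than the patch's exact code, and the union's epte member is what would actually be written into the table.

/* Sketch: constructing an EPT leaf entry for a RAM page.
 * The helper is illustrative, not this patch's ept_set_entry(). */
static ept_entry_t ept_make_leaf_sketch(unsigned long mfn)
{
    ept_entry_t e;

    e.epte = 0;
    e.r = e.w = e.x = 1;          /* full access for ordinary RAM */
    e.emt = 0x6;                  /* write-back memory type (assumed) */
    e.sp_avail = 0;               /* 4K mapping, not a superpage */
    e.mfn = mfn;

    return e;
}
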
@@ -80,6 +98,8 @@ void vmx_realmode(struct cpu_user_regs *regs);
 #define EXIT_REASON_MACHINE_CHECK       41
 #define EXIT_REASON_TPR_BELOW_THRESHOLD 43
 #define EXIT_REASON_APIC_ACCESS         44
+#define EXIT_REASON_EPT_VIOLATION       48
+#define EXIT_REASON_EPT_MISCONFIG       49
 #define EXIT_REASON_WBINVD              54
 
 /*
@@ -143,12 +163,14 @@ void vmx_realmode(struct cpu_user_regs *regs);
 #define VMREAD_OPCODE   ".byte 0x0f,0x78\n"
 #define VMRESUME_OPCODE ".byte 0x0f,0x01,0xc3\n"
 #define VMWRITE_OPCODE  ".byte 0x0f,0x79\n"
+#define INVEPT_OPCODE   ".byte 0x66,0x0f,0x38,0x80\n"   /* m128,r64/32 */
 #define VMXOFF_OPCODE   ".byte 0x0f,0x01,0xc4\n"
 #define VMXON_OPCODE    ".byte 0xf3,0x0f,0xc7\n"
 
+#define MODRM_EAX_08    ".byte 0x08\n" /* ECX, [EAX] */
 #define MODRM_EAX_06    ".byte 0x30\n" /* [EAX], with reg/opcode: /6 */
 #define MODRM_EAX_07    ".byte 0x38\n" /* [EAX], with reg/opcode: /7 */
-#define MODRM_EAX_ECX   ".byte 0xc1\n" /* [EAX], [ECX] */
+#define MODRM_EAX_ECX   ".byte 0xc1\n" /* EAX, ECX */
 
 static inline void __vmptrld(u64 addr)
 {
@@ -232,6 +254,31 @@ static inline void __vm_clear_bit(unsigned long field, unsigned int bit)
     __vmwrite(field, __vmread(field) & ~(1UL << bit));
 }
 
+static inline void __invept(int ext, u64 eptp, u64 gpa)
+{
+    struct {
+        u64 eptp, gpa;
+    } operand = {eptp, gpa};
+
+    __asm__ __volatile__ ( INVEPT_OPCODE
+                           MODRM_EAX_08
+                           /* CF==1 or ZF==1 --> crash (ud2) */
+                           "ja 1f ; ud2 ; 1:\n"
+                           :
+                           : "a" (&operand), "c" (ext)
+                           : "memory");
+}
+
+static inline void ept_sync_all(void)
+{
+    if ( !current->domain->arch.hvm_domain.hap_enabled )
+        return;
+
+    __invept(2, 0, 0);
+}
+
+void ept_sync_domain(struct domain *d);
+
 static inline void __vmxoff(void)
 {
     asm volatile (
@@ -265,4 +312,6 @@ void vmx_inject_hw_exception(struct vcpu *v, int trap, int error_code);
 void vmx_inject_extint(struct vcpu *v, int trap);
 void vmx_inject_nmi(struct vcpu *v);
 
+void ept_p2m_init(struct domain *d);
+
 #endif /* __ASM_X86_HVM_VMX_VMX_H__ */
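
Tying the new definitions together: when hardware reports EXIT_REASON_EPT_VIOLATION, the faulting guest-physical address is available in GUEST_PHYSICAL_ADDRESS and the access type in the exit qualification, and once the p2m has been repaired the stale translations are flushed through the INVEPT-based helpers above. A hedged sketch of that flow; the handler name is an assumption and the p2m fixup itself is elided.

/* Sketch of the EPT-violation path.  The function name is hypothetical;
 * on 32-bit Xen GUEST_PHYSICAL_ADDRESS_HIGH would be combined in. */
#include <asm/hvm/vmx/vmcs.h>
#include <asm/hvm/vmx/vmx.h>

static void ept_violation_sketch(void)
{
    unsigned long qualification = __vmread(EXIT_QUALIFICATION);
    paddr_t gpa = __vmread(GUEST_PHYSICAL_ADDRESS);
    int is_write = !!(qualification & 0x2);   /* bit 1: write access */

    /* ... fix up the p2m entry for gpa, or emulate MMIO, here ... */
    (void)gpa; (void)is_write;

    ept_sync_all();   /* INVEPT: flush translations derived from the EPT */
}
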
index 38a9cec7bdb6077580d105f5f9aa082609fa36e8..1825f0a5d2060c94ba11975557a025617e14654e 100644 (file)
@@ -26,6 +26,8 @@
 #ifndef _XEN_P2M_H
 #define _XEN_P2M_H
 
+#include <xen/config.h>
+#include <xen/paging.h>
 
 /*
  * The phys_to_machine_mapping maps guest physical frame numbers 
@@ -86,54 +88,49 @@ typedef enum {
 #define p2m_is_readonly(_t) (p2m_to_mask(_t) & P2M_RO_TYPES)
 #define p2m_is_valid(_t) (p2m_to_mask(_t) & (P2M_RAM_TYPES | P2M_MMIO_TYPES))
 
+struct p2m_domain {
+    /* Lock that protects updates to the p2m */
+    spinlock_t         lock;
+    int                locker;   /* processor which holds the lock */
+    const char        *locker_function; /* Func that took it */
+
+    /* Pages used to construct the p2m */
+    struct list_head   pages;
+
+    /* Functions to call to get or free pages for the p2m */
+    struct page_info * (*alloc_page  )(struct domain *d);
+    void               (*free_page   )(struct domain *d,
+                                       struct page_info *pg);
+    int                (*set_entry   )(struct domain *d, unsigned long gfn,
+                                       mfn_t mfn, p2m_type_t p2mt);
+    mfn_t              (*get_entry   )(struct domain *d, unsigned long gfn,
+                                       p2m_type_t *p2mt);
+    mfn_t              (*get_entry_current)(unsigned long gfn,
+                                            p2m_type_t *p2mt);
+
+    /* Highest guest frame that's ever been mapped in the p2m */
+    unsigned long max_mapped_pfn;
+};
+
 /* Extract the type from the PTE flags that store it */
 static inline p2m_type_t p2m_flags_to_type(unsigned long flags)
 {
     /* Type is stored in the "available" bits, 9, 10 and 11 */
     return (flags >> 9) & 0x7;
 }
-/* Read the current domain's p2m table (through the linear mapping). */
+
+/* Read the current domain's p2m table. */
 static inline mfn_t gfn_to_mfn_current(unsigned long gfn, p2m_type_t *t)
 {
-    mfn_t mfn = _mfn(INVALID_MFN);
-    p2m_type_t p2mt = p2m_mmio_dm;
-    /* XXX This is for compatibility with the old model, where anything not 
-     * XXX marked as RAM was considered to be emulated MMIO space.
-     * XXX Once we start explicitly registering MMIO regions in the p2m 
-     * XXX we will return p2m_invalid for unmapped gfns */
-
-    if ( gfn <= current->domain->arch.p2m.max_mapped_pfn )
-    {
-        l1_pgentry_t l1e = l1e_empty();
-        int ret;
-
-        ASSERT(gfn < (RO_MPT_VIRT_END - RO_MPT_VIRT_START) 
-               / sizeof(l1_pgentry_t));
-
-        /* Need to __copy_from_user because the p2m is sparse and this
-         * part might not exist */
-        ret = __copy_from_user(&l1e,
-                               &phys_to_machine_mapping[gfn],
-                               sizeof(l1e));
-
-        if ( ret == 0 ) {
-            p2mt = p2m_flags_to_type(l1e_get_flags(l1e));
-            ASSERT(l1e_get_pfn(l1e) != INVALID_MFN || !p2m_is_ram(p2mt));
-            if ( p2m_is_valid(p2mt) )
-                mfn = _mfn(l1e_get_pfn(l1e));
-            else 
-                /* XXX see above */
-                p2mt = p2m_mmio_dm;
-        }
-    }
-
-    *t = p2mt;
-    return mfn;
+    return current->domain->arch.p2m->get_entry_current(gfn, t);
 }
 
 /* Read another domain's P2M table, mapping pages as we go */
-mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t);
+static inline
+mfn_t gfn_to_mfn_foreign(struct domain *d, unsigned long gfn, p2m_type_t *t)
+{
+    return d->arch.p2m->get_entry(d, gfn, t);
+}
 
 /* General conversion function from gfn to mfn */
 #define gfn_to_mfn(d, g, t) _gfn_to_mfn((d), (g), (t))
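
The three new function pointers (set_entry, get_entry, get_entry_current) are the hooks through which the EPT code replaces the old linear p2m walk; gfn_to_mfn_current() and gfn_to_mfn_foreign() above now simply dispatch through them. A sketch of how an implementation registers its operations; the ept_* callees stand in for the functions this patch adds in xen/arch/x86/mm/hap/p2m-ept.c.

/* Sketch: plugging a p2m implementation's operations into struct p2m_domain.
 * The declarations below are placeholders for p2m-ept.c's functions. */
#include <xen/sched.h>
#include <asm/p2m.h>

int   ept_set_entry(struct domain *d, unsigned long gfn, mfn_t mfn,
                    p2m_type_t p2mt);
mfn_t ept_get_entry(struct domain *d, unsigned long gfn, p2m_type_t *p2mt);
mfn_t ept_get_entry_current(unsigned long gfn, p2m_type_t *p2mt);

void ept_p2m_init_sketch(struct domain *d)
{
    struct p2m_domain *p2m = d->arch.p2m;

    p2m->set_entry         = ept_set_entry;
    p2m->get_entry         = ept_get_entry;
    p2m->get_entry_current = ept_get_entry_current;
}
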
@@ -149,7 +146,7 @@ static inline mfn_t _gfn_to_mfn(struct domain *d,
     }
     if ( likely(current->domain == d) )
         return gfn_to_mfn_current(gfn, t);
-    else 
+    else
         return gfn_to_mfn_foreign(d, gfn, t);
 }
 
@@ -185,7 +182,7 @@ gl1e_to_ml1e(struct domain *d, l1_pgentry_t l1e)
 
 
 /* Init the datastructures for later use by the p2m code */
-void p2m_init(struct domain *d);
+int p2m_init(struct domain *d);
 
 /* Allocate a new p2m table for a domain. 
  *
@@ -199,6 +196,7 @@ int p2m_alloc_table(struct domain *d,
 
 /* Return all the p2m resources to Xen. */
 void p2m_teardown(struct domain *d);
+void p2m_final_teardown(struct domain *d);
 
 /* Add a page to a domain's p2m table */
 int guest_physmap_add_entry(struct domain *d, unsigned long gfn,
index 4c7e6f1327ff3a2bbc854e2861cd6924db987857..f7512a7b26206d627f9634cab7b798e609a35001 100644 (file)
@@ -183,7 +183,7 @@ void paging_vcpu_init(struct vcpu *v);
 
 /* Set up the paging-assistance-specific parts of a domain struct at
  * start of day.  Called for every domain from arch_domain_create() */
-void paging_domain_init(struct domain *d);
+int paging_domain_init(struct domain *d);
 
 /* Handler for paging-control ops: operations from user-space to enable
  * and disable ephemeral shadow modes (test mode and log-dirty mode) and
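
paging_domain_init() now returns int for the same reason p2m_init() does above: the p2m_domain is allocated dynamically and that allocation can fail, so arch_domain_create() is expected to check and propagate the result. A hedged sketch of the calling pattern (paraphrased, not the file's actual code):

/* Sketch: propagating the new failure path from paging_domain_init(). */
#include <xen/sched.h>
#include <asm/paging.h>

static int domain_paging_setup_sketch(struct domain *d)
{
    int rc;

    if ( (rc = paging_domain_init(d)) != 0 )
        return rc;              /* e.g. p2m allocation failed */

    /* ... rest of domain construction ... */
    return 0;
}
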
index aad2196db3ab0f56c00d24f5031c1c3247ef4aff..554309f49b6e23225d5c8eb257413e438a5fba36 100644 (file)
@@ -83,7 +83,8 @@
 
 /* Boolean: Enable virtual HPET (high-precision event timer)? (x86-only) */
 #define HVM_PARAM_HPET_ENABLED 11
+#define HVM_PARAM_IDENT_PT     12
 
-#define HVM_NR_PARAMS          12
+#define HVM_NR_PARAMS          13
 
 #endif /* __XEN_PUBLIC_HVM_PARAMS_H__ */
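
HVM_PARAM_IDENT_PT carries the guest-physical address of the identity-mapped page table that the domain builder reserves for EPT guests. On the tools side it is set through the ordinary HVM-parameter call; a minimal sketch, with an illustrative wrapper name and address argument:

/* Sketch (libxc side): telling Xen where the identity page table lives.
 * The wrapper is hypothetical; the address comes from the builder's
 * special-page layout. */
#include <xenctrl.h>
#include <xen/hvm/params.h>

static int set_ident_pt_sketch(int xc_handle, domid_t dom,
                               unsigned long ident_pt_addr)
{
    return xc_set_hvm_param(xc_handle, dom, HVM_PARAM_IDENT_PT,
                            ident_pt_addr);
}
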
index 5313b9a1d8fda8924cbd99cc6676e25a6e6aa04c..7d58109ec2d4d9294b8323eac173268c04a9e945 100644 (file)
@@ -30,6 +30,7 @@ do_sched_op(
     int cmd,
     XEN_GUEST_HANDLE(void) arg);
 
+extern spinlock_t domctl_lock;
 extern long
 do_domctl(
     XEN_GUEST_HANDLE(xen_domctl_t) u_domctl);
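
domctl_lock is exported so that code outside common/domctl.c, such as the paging/HAP enable path touched by this patch, can serialize with in-flight domctl operations. A minimal usage sketch; the enclosing function is an assumption.

/* Sketch: serialising an internal paging-mode change against domctls.
 * The function name is hypothetical. */
#include <xen/sched.h>
#include <xen/spinlock.h>
#include <xen/hypercall.h>

static void paging_mode_change_sketch(struct domain *d)
{
    spin_lock(&domctl_lock);
    /* ... switch d's paging assistance mode safely here ... */
    spin_unlock(&domctl_lock);
}
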